When building classification models, feature engineering often requires discretizing continuous variables, that is, converting continuous fields into discrete ones. During discretization, the continuous variable is re-encoded. Discretized features tend to make a model more stable and reduce the risk of overfitting. This article introduces three common binning methods:

  • Equal-width binning
  • Equal-frequency binning
  • Clustering-based binning

The sections below explain each method and demonstrate the code on a small set of simulated exam scores.

import numpy as np
import pandas as pd

np.random.seed(1)

n = 20
ID = np.arange(1, n + 1)
# Simulated scores: normal distribution with mean 80, std 10, truncated to int
SCORE = np.random.normal(80, 10, n).astype('int')
df = pd.DataFrame({'ID': ID, 'SCORE': SCORE})
    ID  SCORE
0    1     96
1    2     73
2    3     74
3    4     69
4    5     88
5    6     56
6    7     97
7    8     72
8    9     83
9   10     77
10  11     94
11  12     59
12  13     76
13  14     76
14  15     91
15  16     69
16  17     78
17  18     71
18  19     80
19  20     85

We use the KBinsDiscretizer class from scikit-learn to perform all three binning operations.

  • n_bins specifies the number of bins; the default is 5
  • strategy selects the binning strategy:
    • uniform: equal-width bins; every bin spans the same width
    • quantile: equal-frequency bins, placed at quantiles of each feature so that each bin holds roughly the same number of samples
    • kmeans: bins defined by a k-means clustering run independently on each feature
  • encode controls how the binned result is encoded, e.g. as ordinal integers or as one-hot vectors

KBinsDiscretizer expects a 2-D column vector, so the DataFrame column must be reshaped first:

score = df['SCORE'].values.reshape(-1, 1)
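Before moving on, here is a small illustration of the encode parameter, using made-up values independent of the score data above:

```python
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

x = np.array([56.0, 69.0, 76.0, 83.0, 97.0]).reshape(-1, 1)

# encode="ordinal": each value is replaced by its integer bin index
ordinal = KBinsDiscretizer(n_bins=3, encode="ordinal",
                           strategy="uniform").fit_transform(x)
print(ordinal.ravel())  # [0. 0. 1. 1. 2.]

# encode="onehot-dense": one 0/1 indicator column per bin
onehot = KBinsDiscretizer(n_bins=3, encode="onehot-dense",
                          strategy="uniform").fit_transform(x)
print(onehot.shape)  # (5, 3): one column per bin
```

With encode="onehot" (the default) the result is instead a sparse matrix, which is convenient when the binned feature feeds directly into a linear model.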

Equal-width binning

Below, the scores are split into 3 bins:

from sklearn.preprocessing import KBinsDiscretizer

dis = KBinsDiscretizer(n_bins=3,
                       encode="ordinal",
                       strategy="uniform"
                      )
label_uniform = dis.fit_transform(score)  # fit and transform in one step

The minimum simulated score is 56 and the maximum is 97, so the three equal-width bins have the edges [56., 69.66666667, 83.33333333, 97.] (available after fitting as dis.bin_edges_).
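These equal-width edges can be reproduced by hand with np.linspace over the observed min and max, which makes a useful sanity check on what strategy="uniform" computes (the scores below are copied from the simulated data above):

```python
import numpy as np

score = np.array([96, 73, 74, 69, 88, 56, 97, 72, 83, 77,
                  94, 59, 76, 76, 91, 69, 78, 71, 80, 85])

# 3 equal-width bins between min and max -> 4 edge values
edges = np.linspace(score.min(), score.max(), 3 + 1)
print(edges)  # [56.  69.66666667  83.33333333  97.]

# Bin index per value: count how many interior edges each value exceeds
labels = np.digitize(score, edges[1:-1])
print(labels[:5])  # [2 1 1 0 2]
```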

Equal-frequency binning

Equal-frequency binning places (approximately) the same number of values in each bin. Again we split into 3 bins:

dis = KBinsDiscretizer(n_bins=3,
                       encode="ordinal",
                       strategy="quantile"
                      )

label_quantile = dis.fit_transform(score)
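Equal-frequency edges are simply quantiles of the data. The sketch below (using the same scores, hand-copied) shows each bin ending up with roughly 20/3 values; note that KBinsDiscretizer's exact edges can differ slightly depending on its quantile interpolation:

```python
import numpy as np

score = np.array([96, 73, 74, 69, 88, 56, 97, 72, 83, 77,
                  94, 59, 76, 76, 91, 69, 78, 71, 80, 85])

# Edges of 3 equal-frequency bins are the 0, 1/3, 2/3 and 1 quantiles
edges = np.quantile(score, [0, 1/3, 2/3, 1])
print(edges)  # [56.  73.33333333  82.  97.]

labels = np.digitize(score, edges[1:-1])
print(np.bincount(labels))  # [7 6 7] -- close to 20/3 per bin
```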

Clustering-based binning

Clustering-based binning first clusters the continuous variable and then replaces each value with the label of the cluster it falls into.

dis = KBinsDiscretizer(n_bins=3,
                       encode="ordinal",
                       strategy="kmeans"
                      )

label_kmeans = dis.fit_transform(score)  # fit_transform: the new discretizer must be fitted first
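Conceptually, the kmeans strategy runs a 1-D k-means and places bin edges halfway between adjacent cluster centers. Below is a rough sketch of that idea, not KBinsDiscretizer's exact implementation (which also seeds the centers from uniform quantiles):

```python
import numpy as np
from sklearn.cluster import KMeans

score = np.array([96, 73, 74, 69, 88, 56, 97, 72, 83, 77,
                  94, 59, 76, 76, 91, 69, 78, 71, 80, 85], dtype=float)

# Cluster the 1-D values into 3 groups
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(score.reshape(-1, 1))
centers = np.sort(km.cluster_centers_.ravel())

# Edges: data min/max plus the midpoints between adjacent centers
edges = np.concatenate(([score.min()],
                        (centers[:-1] + centers[1:]) / 2,
                        [score.max()]))
labels = np.digitize(score, edges[1:-1])
print(edges)
print(labels[:5])
```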

Comparison

df["label_uniform"] = label_uniform
df["label_quantile"] = label_quantile
df["label_kmeans"] = label_kmeans

The three labelings side by side:

|    |   ID |   SCORE |   label_uniform |   label_quantile |   label_kmeans |
|---:|-----:|--------:|----------------:|-----------------:|---------------:|
|  0 |    1 |      96 |               2 |                2 |              2 |
|  1 |    2 |      73 |               1 |                0 |              1 |
|  2 |    3 |      74 |               1 |                1 |              1 |
|  3 |    4 |      69 |               0 |                0 |              0 |
|  4 |    5 |      88 |               2 |                2 |              2 |
|  5 |    6 |      56 |               0 |                0 |              0 |
|  6 |    7 |      97 |               2 |                2 |              2 |
|  7 |    8 |      72 |               1 |                0 |              1 |
|  8 |    9 |      83 |               1 |                2 |              1 |
|  9 |   10 |      77 |               1 |                1 |              1 |
| 10 |   11 |      94 |               2 |                2 |              2 |
| 11 |   12 |      59 |               0 |                0 |              0 |
| 12 |   13 |      76 |               1 |                1 |              1 |
| 13 |   14 |      76 |               1 |                1 |              1 |
| 14 |   15 |      91 |               2 |                2 |              2 |
| 15 |   16 |      69 |               0 |                0 |              0 |
| 16 |   17 |      78 |               1 |                1 |              1 |
| 17 |   18 |      71 |               1 |                0 |              1 |
| 18 |   19 |      80 |               1 |                1 |              1 |
| 19 |   20 |      85 |               2 |                2 |              2 |
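To quantify how much two strategies disagree, pd.crosstab of two label columns is handy. The labels below are hand-copied from the df built above:

```python
import pandas as pd

labels = pd.DataFrame({
    "label_uniform":  [2, 1, 1, 0, 2, 0, 2, 1, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 1, 2],
    "label_quantile": [2, 0, 1, 0, 2, 0, 2, 0, 2, 1, 2, 0, 1, 1, 2, 0, 1, 0, 1, 2],
})

# Rows: uniform bin, columns: quantile bin; off-diagonal cells = disagreements
print(pd.crosstab(labels["label_uniform"], labels["label_quantile"]))
```

Here 16 of the 20 samples land in the same bin under both strategies; the disagreements all involve the middle uniform bin.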

References

  • https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html
  • https://mp.weixin.qq.com/s/H19asF7Qo_0Wc5FIn8Qkww
